import re
import warnings

import fitz
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame
# Raise pandas' display limits (max columns, total width, column width) so
# printed DataFrames are shown in full instead of being elided with "...".
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)
def parse_data_line(subtext):
    """Extract table rows made of a text label followed by four numeric cells.

    A row is recognised as one or more digit-free label lines followed by
    four consecutive lines that contain only digits, ``,``, ``.``, ``%``,
    ``-`` and spaces.

    Args:
        subtext: Page text in which every table cell sits on its own line.

    Returns:
        A list of 5-tuples ``(label, cell1, cell2, cell3, cell4)``.  All
        whitespace is removed from the label, so a label wrapped over several
        lines is joined; the cells are returned exactly as matched.  The list
        is empty when nothing matches.
    """
    # Raw strings keep ``\-``, ``\D`` and ``\s`` as regex escapes instead of
    # invalid Python string escapes.
    cell = r'([0-9,.%\- ]*?)\n'
    # ``\D`` already matches newlines, so a single ``\D+\n`` spans a multi-line
    # label.  The previous nested form ``(\D+\n)+`` produced the same matches
    # but backtracked exponentially on long digit-free passages.
    row = re.compile(r'(?<=\n)(\D+\n)' + cell * 4)
    return [
        (re.sub(r'\s', '', label), *values)
        for label, *values in row.findall(subtext)
    ]
def parse_data_title(subtext):
    """Extract the header cells of the key accounting data table.

    The header is taken to be the text that follows a ``否`` character (after
    optional whitespace and a newline) and ends just before the line that
    starts with ``营业收入``.

    Args:
        subtext: Page text in which every table cell sits on its own line.

    Returns:
        A list with one string per header line, all whitespace removed, or
        ``None`` when the header cannot be located (a warning is emitted in
        that case).
    """
    header = re.compile(r'(?<=否)\s*\n(.*?)(?=\n营业收入)', re.DOTALL)
    found = header.search(subtext)
    if found is None:
        # Instantiating ``Warning(...)`` alone emits nothing; warnings.warn
        # actually reports the problem to the caller.
        warnings.warn('主要会计数据与财务指标,标题行没有匹配成功', stacklevel=2)
        return None
    cells = found.group().strip().split('\n')
    return [re.sub(r'\s', '', cell) for cell in cells]
def parse_accounting_financial_data(text):
    """Build a DataFrame of the key accounting data table found in *text*.

    Args:
        text: Text of the report page that contains the table.

    Returns:
        A DataFrame whose first column is ``'item'`` (the row label) and whose
        next four columns are named after the first four header cells returned
        by ``parse_data_title``.  Each row comes from ``parse_data_line`` and
        holds the cell values as strings.

    Raises:
        ValueError: If the header row is missing or has fewer than four cells.
    """
    title = parse_data_title(text)
    # Fail with an explicit message rather than an opaque TypeError/IndexError
    # when the header was not recognised.
    if title is None or len(title) < 4:
        raise ValueError(
            'expected 4 header cells for the accounting data table, got %r'
            % (title,)
        )
    lines = parse_data_line(text)
    # Building from rows keeps all five columns even if two header cells carry
    # the same text (a dict literal would silently merge them).
    return pd.DataFrame(lines, columns=['item', *title[:4]])
# Zixin Pharmaceutical: net profit attributable to shareholders, 2016-2020.
zixin = []
# Each entry: (annual report file, offset added to the TOC page number,
# table columns to read).  The 2018 report supplies three points at once:
# columns 4, 2 and 1 are plotted as 2016, 2017 and 2018.
for report_path, page_offset, value_columns in (
    ('紫鑫药业:2018年年度报告.pdf', 0, (4, 2, 1)),
    ('紫鑫药业:2019年年度报告.pdf', 0, (1,)),
    # For the 2020 report the page after the TOC target is read.
    ('紫鑫药业:2020年年度报告.pdf', 1, (1,)),
):
    # Close each PDF as soon as its page text has been extracted.
    with fitz.open(report_path) as doc:
        toc = doc.get_toc()
        # toc[7][2] is the page number stored in the eighth TOC entry;
        # load_page() takes a 0-based index, hence the -1.
        page_number = toc[7][2] + page_offset
        page = doc.load_page(page_number - 1)
        # get_text() is the current name of the deprecated getText() alias.
        text = page.get_text()
    df = parse_accounting_financial_data(text)
    # Row 1 of the parsed table is the series plotted below.
    zixin.extend(df.iloc[1, column] for column in value_columns)
# Drop thousands separators before converting to numbers.
zixin = [float(x.replace(',', '')) for x in zixin]
# The later company sections reuse `time` as their x axis, so it must stay
# defined here under this name.
time = ['2016', '2017', '2018', '2019', '2020']
plt.figure()
plt.plot(time, zixin, 'r-')
plt.xlabel('年度')
plt.ylabel('归属于股东的净利润')
plt.title('紫鑫药业净利润')
plt.show()
# Yiling Pharmaceutical: net profit attributable to shareholders, 2016-2020.
yiling = []
# Each entry: (annual report file, fixed page number or None to use the TOC,
# table columns to read).  The 2018 report supplies three points at once:
# columns 4, 2 and 1 are plotted as 2016, 2017 and 2018.
for report_path, fixed_page, value_columns in (
    ('以岭药业:2018年年度报告.pdf', None, (4, 2, 1)),
    ('以岭药业:2019年年度报告.pdf', None, (1,)),
    # The 2020 report is read from a hard-coded page instead of its TOC.
    ('以岭药业:2020年年度报告.pdf', 6, (1,)),
):
    # Close each PDF as soon as its page text has been extracted.
    with fitz.open(report_path) as doc:
        if fixed_page is None:
            toc = doc.get_toc()
            # toc[7][2] is the page number stored in the eighth TOC entry.
            page_number = toc[7][2]
        else:
            page_number = fixed_page
        # load_page() takes a 0-based index, hence the -1.
        page = doc.load_page(page_number - 1)
        # get_text() is the current name of the deprecated getText() alias.
        text = page.get_text()
    df = parse_accounting_financial_data(text)
    # Row 1 of the parsed table is the series plotted below.
    yiling.extend(df.iloc[1, column] for column in value_columns)
# Drop thousands separators before converting to numbers.
yiling = [float(x.replace(',', '')) for x in yiling]
plt.figure()
plt.plot(time, yiling, 'r-')
plt.xlabel('年度')
plt.ylabel('归属于股东的净利润')
plt.title('以岭药业净利润')
plt.show()
# Xintian Pharmaceutical: net profit attributable to shareholders, 2016-2020.
xintian = []
# Each entry: (annual report file, table columns to read).  The 2018 report
# supplies three points at once: columns 4, 2 and 1 are plotted as 2016, 2017
# and 2018.
for report_path, value_columns in (
    ('新天药业:2018年年度报告.pdf', (4, 2, 1)),
    ('新天药业:2019年年度报告.pdf', (1,)),
    ('新天药业:2020年年度报告.pdf', (1,)),
):
    # Close each PDF as soon as its page text has been extracted.
    with fitz.open(report_path) as doc:
        toc = doc.get_toc()
        # toc[7][2] is the page number stored in the eighth TOC entry;
        # load_page() takes a 0-based index, hence the -1.
        page_number = toc[7][2]
        page = doc.load_page(page_number - 1)
        # get_text() is the current name of the deprecated getText() alias.
        text = page.get_text()
    df = parse_accounting_financial_data(text)
    # Row 1 of the parsed table is the series plotted below.
    xintian.extend(df.iloc[1, column] for column in value_columns)
# Drop thousands separators before converting to numbers.
xintian = [float(x.replace(',', '')) for x in xintian]
plt.figure()
plt.plot(time, xintian, 'r-')
plt.xlabel('年度')
plt.ylabel('归属于股东的净利润')
plt.title('新天药业净利润')
plt.show()
# Xinbang Pharmaceutical: net profit attributable to shareholders, 2016-2020.
xinbang = []
# Each entry: (annual report file, table columns to read).  The 2018 report
# supplies three points at once: columns 4, 2 and 1 are plotted as 2016, 2017
# and 2018.
for report_path, value_columns in (
    ('信邦制药:2018年年度报告.pdf', (4, 2, 1)),
    ('信邦制药:2019年年度报告.pdf', (1,)),
    ('信邦制药:2020年年度报告.pdf', (1,)),
):
    # Close each PDF as soon as its page text has been extracted.
    with fitz.open(report_path) as doc:
        toc = doc.get_toc()
        # toc[7][2] is the page number stored in the eighth TOC entry;
        # load_page() takes a 0-based index, hence the -1.
        page_number = toc[7][2]
        page = doc.load_page(page_number - 1)
        # get_text() is the current name of the deprecated getText() alias.
        text = page.get_text()
    df = parse_accounting_financial_data(text)
    # Row 1 of the parsed table is the series plotted below.
    xinbang.extend(df.iloc[1, column] for column in value_columns)
# Drop thousands separators before converting to numbers.
xinbang = [float(x.replace(',', '')) for x in xinbang]
plt.figure()
plt.plot(time, xinbang, 'r-')
plt.xlabel('年度')
plt.ylabel('归属于股东的净利润')
plt.title('信邦制药净利润')
plt.show()
# Wohua Pharmaceutical: net profit attributable to shareholders, 2016-2020.
wohua = []
# Each entry: (annual report file, table columns to read).  The 2018 report
# supplies three points at once: columns 4, 2 and 1 are plotted as 2016, 2017
# and 2018.
for report_path, value_columns in (
    ('沃华医药:2018年年度报告.pdf', (4, 2, 1)),
    ('沃华医药:2019年年度报告.pdf', (1,)),
    ('沃华医药:2020年年度报告.pdf', (1,)),
):
    # Close each PDF as soon as its page text has been extracted.
    with fitz.open(report_path) as doc:
        toc = doc.get_toc()
        # toc[7][2] is the page number stored in the eighth TOC entry;
        # load_page() takes a 0-based index, hence the -1.
        page_number = toc[7][2]
        page = doc.load_page(page_number - 1)
        # get_text() is the current name of the deprecated getText() alias.
        text = page.get_text()
    df = parse_accounting_financial_data(text)
    # Row 1 of the parsed table is the series plotted below.
    wohua.extend(df.iloc[1, column] for column in value_columns)
# Drop thousands separators before converting to numbers.
wohua = [float(x.replace(',', '')) for x in wohua]
plt.figure()
plt.plot(time, wohua, 'r-')
plt.xlabel('年度')
plt.ylabel('归属于股东的净利润')
plt.title('沃华医药净利润')
plt.show()
# Ruikang Pharmaceutical: net profit attributable to shareholders, 2016-2020.
ruikang = []
# Each entry: (annual report file, table columns to read).  The 2018 report
# supplies three points at once: columns 4, 2 and 1 are plotted as 2016, 2017
# and 2018.
for report_path, value_columns in (
    ('瑞康医药:2018年年度报告.pdf', (4, 2, 1)),
    ('瑞康医药:2019年年度报告.pdf', (1,)),
    ('瑞康医药:2020年年度报告.pdf', (1,)),
):
    # Close each PDF as soon as its page text has been extracted.
    with fitz.open(report_path) as doc:
        toc = doc.get_toc()
        # toc[7][2] is the page number stored in the eighth TOC entry;
        # load_page() takes a 0-based index, hence the -1.
        page_number = toc[7][2]
        page = doc.load_page(page_number - 1)
        # get_text() is the current name of the deprecated getText() alias.
        text = page.get_text()
    df = parse_accounting_financial_data(text)
    # Row 1 of the parsed table is the series plotted below.
    ruikang.extend(df.iloc[1, column] for column in value_columns)
# Drop thousands separators before converting to numbers.
ruikang = [float(x.replace(',', '')) for x in ruikang]
plt.figure()
plt.plot(time, ruikang, 'r-')
plt.xlabel('年度')
plt.ylabel('归属于股东的净利润')
plt.title('瑞康医药净利润')
plt.show()
# Qizheng Tibetan Medicine: net profit attributable to shareholders, 2016-2020.
qizheng = []
# Each entry: (annual report file, table columns to read).  The 2018 report
# supplies three points at once: columns 4, 2 and 1 are plotted as 2016, 2017
# and 2018.
for report_path, value_columns in (
    ('奇正藏药:2018年年度报告.pdf', (4, 2, 1)),
    ('奇正藏药:2019年年度报告.pdf', (1,)),
    ('奇正藏药:2020年年度报告.pdf', (1,)),
):
    # Close each PDF as soon as its page text has been extracted.
    with fitz.open(report_path) as doc:
        toc = doc.get_toc()
        # toc[7][2] is the page number stored in the eighth TOC entry;
        # load_page() takes a 0-based index, hence the -1.
        page_number = toc[7][2]
        page = doc.load_page(page_number - 1)
        # get_text() is the current name of the deprecated getText() alias.
        text = page.get_text()
    df = parse_accounting_financial_data(text)
    # Row 1 of the parsed table is the series plotted below.
    qizheng.extend(df.iloc[1, column] for column in value_columns)
# Drop thousands separators before converting to numbers.
qizheng = [float(x.replace(',', '')) for x in qizheng]
plt.figure()
plt.plot(time, qizheng, 'r-')
plt.xlabel('年度')
# The y axis was labelled 'EPS', but this is the same net-profit row the other
# company charts plot and the title below names net profit.
plt.ylabel('归属于股东的净利润')
plt.title('奇正藏药净利润')
plt.show()
# Huasen Pharmaceutical: net profit attributable to shareholders, 2016-2020.
huasen = []
# Each entry: (annual report file, fixed page number or None to use the TOC,
# table columns to read).  The 2018 report supplies three points at once:
# columns 4, 2 and 1 are plotted as 2016, 2017 and 2018.
for report_path, fixed_page, value_columns in (
    # The 2018 report is read from a hard-coded page instead of its TOC.
    ('华森制药:2018年年度报告.pdf', 7, (4, 2, 1)),
    ('华森制药:2019年年度报告.pdf', None, (1,)),
    ('华森制药:2020年年度报告.pdf', None, (1,)),
):
    # Close each PDF as soon as its page text has been extracted.
    with fitz.open(report_path) as doc:
        if fixed_page is None:
            toc = doc.get_toc()
            # toc[7][2] is the page number stored in the eighth TOC entry.
            page_number = toc[7][2]
        else:
            page_number = fixed_page
        # load_page() takes a 0-based index, hence the -1.
        page = doc.load_page(page_number - 1)
        # get_text() is the current name of the deprecated getText() alias.
        text = page.get_text()
    df = parse_accounting_financial_data(text)
    # Row 1 of the parsed table is the series plotted below.
    huasen.extend(df.iloc[1, column] for column in value_columns)
# Drop thousands separators before converting to numbers.
huasen = [float(x.replace(',', '')) for x in huasen]
plt.figure()
plt.plot(time, huasen, 'r-')
plt.xlabel('年度')
plt.ylabel('归属于股东的净利润')
plt.title('华森制药净利润')
plt.show()
# Hansen Pharmaceutical: collect the row-1 table values from the 2018-2020
# annual reports.
hansen = []
# Each entry: (annual report file, table columns to read).  Columns 4, 2 and 1
# of the 2018 report are collected first, then column 1 of each later report.
for report_path, value_columns in (
    ('汉森制药:2018年年度报告.pdf', (4, 2, 1)),
    ('汉森制药:2019年年度报告.pdf', (1,)),
    ('汉森制药:2020年年度报告.pdf', (1,)),
):
    # Close each PDF as soon as its page text has been extracted.
    with fitz.open(report_path) as doc:
        toc = doc.get_toc()
        # toc[7][2] is the page number stored in the eighth TOC entry;
        # load_page() takes a 0-based index, hence the -1.
        page_number = toc[7][2]
        page = doc.load_page(page_number - 1)
        # get_text() is the current name of the deprecated getText() alias.
        text = page.get_text()
    df = parse_accounting_financial_data(text)
    # The collected values are still the raw strings from the parsed table.
    hansen.extend(df.iloc[1, column] for column in value_columns)